# Analyzing correlations between dependency share and macro variables
# Produces:
#  Correlation plot
#  Regression model
#  Diagnostic plots
here::i_am("R/CountryDependencyCorrelations.R")
library(here)
library(ggplot2)
library(WDI)
library(dplyr)
library(readr)
library(scales)
library(viridis)
library(patchwork)
library(ggrepel)
library(jsonlite)


source(here("R/DiagnosticPlot.R"))

redo_data <- TRUE
download_wb <- TRUE
start_year <- 2017
end_year <- 2017
macro_data_path <- here("data/tidy/macro_data.csv")

# Prepare data---------------
## Dependency shares---------

order_regions <- c(
  'East Asia & \nPacific', 
  'Europe & \nCentral Asia', 
  'Latin America & \nCaribbean', 
  'Middle East & \nNorth Africa', 
  'South Asia', 
  'Sub-Saharan Africa'
)
if (redo_data){
dependency_data <- read_csv(
  "data/tidy/country_vulnerability.csv", show_col_types = FALSE) |> 
  mutate(region = case_when(
    region == "Middle East & North Africa" ~ "Middle East & \nNorth Africa",
    region == 'East Asia & Pacific' ~ 'Latin America & \nCaribbean',
    region == 'Europe & Central Asia' ~ 'Latin America & \nCaribbean',
    region == 'Latin America & Caribbean' ~ 'Latin America & \nCaribbean',
    region == 'Sub-Saharan Africa' ~ 'Sub-Saharan \nAfrica',
    .default = region
  )) |> 
  filter(region != "Other") |> 
  mutate(region=factor(
    x=region, levels = rev(order_regions), ordered = TRUE))

## Get World Bank Data------
file_path_wb <- here("data/raw/wb_data.csv")

if (download_wb){
  cat("Downloading World Bank data...\n")
  
  # Define indicators
  indicators <- c(
    "gdp_per_capita_ppp" = "NY.GDP.PCAP.PP.KD",
    "population" = "SP.POP.TOTL", 
    "manufacturing_va_pct_gdp" = "NV.IND.MANF.ZS",
    "total_natural_resources_rents_pct_gdp" = "NY.GDP.TOTL.RT.ZS",
    "trade_pct_gdp" = "NE.TRD.GNFS.ZS"
  )
  
  # Download data for 2017
  wb_data <- WDI(
    indicator = indicators,
    start = start_year,
    end = end_year,
    extra = TRUE
  )
  
  # Clean and rename
  wb_clean <- wb_data %>%
    dplyr::select(iso3c, year, all_of(names(indicators)), income)%>%
    filter(!is.na(iso3c)) 
  
  write_csv(x = wb_clean, file = file_path_wb)
  
  cat("Downloaded data for", nrow(wb_clean), "countries\n")
  cat("Saved downloaded data to: ", file_path_wb)
} else{
  read_csv(file = file_path_wb, show_col_types = FALSE)
  cat("Read data from: ", file_path_wb)
}


## Economic Complexity Data------------
# https://atlas.hks.harvard.edu/data-downloads
eci_raw <- read_csv(
  file = here("data/raw/growth_proj_eci_rankings.csv"), 
  show_col_types = FALSE
  ) |> 
  filter(year==2017) |> 
  dplyr::select(country_iso3_code, year, eci_hs92)

## Merge data-----------
dep_wb <- left_join(dependency_data, wb_clean, by = c("country"="iso3c"))
final_data <- left_join(dep_wb, eci_raw, by = c("country"="country_iso3_code", "year"))
write_csv(final_data, macro_data_path)
} else{
  final_data <- read_csv(file = macro_data_path, show_col_types = FALSE)
}

# Create plots-----
create_correlation_plots <- function(data) {
  cat("Creating correlation plots...\n")
  
  # Define common theme
  theme_correlation <- theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      plot.subtitle = element_text(size = 10, color = "grey60"),
      axis.title = element_text(size = 10),
      axis.text = element_text(size = 9),
      legend.position = "bottom",
      legend.title = element_text(size = 9),
      legend.text = element_text(size = 8),
      panel.grid.minor = element_blank(),
      strip.text = element_text(size = 10, face = "bold")
    )
  
  # Color palette for regions
  region_colors <- c(
    "Sub-Saharan \nAfrica" = "#E31A1C",
    "Latin America & \nCaribbean" = "#1F78B4", 
    "East Asia & \nPacific" = "#33A02C",
    "Middle East & \nNorth Africa" = "#FF7F00",
    "South Asia" = "#6A3D9A",
    "Europe & \nCentral Asia" = "#FB9A99",
    "Other" = "#CCCCCC"
  )
  
  # # Plot 1: ECI vs Dependency
  # p1 <- data %>%
  #   filter(!is.na(eci_hs92)) %>%
  #   ggplot(aes(x = eci_hs92, y = avg_dependency_pct, color = region)) +
  #   geom_point(size = 2.5, alpha = 0.8) +
  #   geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed", alpha = 0.3) +
  #   scale_color_manual(values = region_colors, name = "Region") +
  #   scale_y_continuous(labels = function(x) paste0(x, "%")) +
  #   labs(
  #     title = "Economic Sophistication vs. Trade Dependency",
  #     subtitle = "Countries with complex exports show lower dependency on Global North",
  #     x = "Economic Complexity Index (Higher = More Sophisticated)",
  #     y = "Average Dependency Share (%)",
  #     caption = "Source: Harvard Atlas of Economic Complexity, Authors' calculations"
  #   ) +
  #   theme_correlation
  
  # Replace Plot 1: Trade Openness vs Dependency (instead of ECI)
  p1 <- data %>%
    filter(!is.na(trade_pct_gdp)) %>%
    ggplot(aes(x = trade_pct_gdp, y = avg_dependency_pct, color = region)) +
    geom_point(size = 2.5, alpha = 0.8) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed", alpha = 0.3) +
    scale_color_manual(values = region_colors, name = "Region") +
    scale_x_continuous(labels = function(x) paste0(x, "%")) +
    scale_y_continuous(labels = function(x) paste0(x, "%")) +
    labs(
      title = "Trade Openness vs. Trade Dependency",
      subtitle = "More trade-open countries may show higher dependency on Global North",
      x = "Trade (% of GDP)",
      y = "Average Dependency Share (%)",
      caption = "Source: World Bank, Authors' calculations"
    ) +
    theme_correlation
  
  # Plot 2: GDP per capita vs Dependency  
  p2 <- data %>%
    mutate(log_gdp_per_capita=log(gdp_per_capita_ppp)) |> 
    filter(!is.na(gdp_per_capita_ppp)) %>%
    ggplot(aes(x = log_gdp_per_capita, y = avg_dependency_pct, color = region)) +
    geom_point(size = 2.5, alpha = 0.8) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed", alpha = 0.3) +
    scale_color_manual(values = region_colors, name = "Region") +
    scale_x_continuous(
      labels = function(x) paste0("$", comma(10^x, scale = 1e-3), "K"),
      breaks = log10(c(1000, 5000, 10000, 25000, 50000))
    ) +
    scale_y_continuous(labels = function(x) paste0(x, "%")) +
    labs(
      title = "Development Level vs. Trade Dependency", 
      subtitle = "Wealthier countries generally show lower dependency",
      x = "GDP per Capita (PPP, 2017 USD, log scale)",
      y = "Average Dependency Share (%)",
      caption = "Source: World Bank, Authors' calculations"
    ) +
    theme_correlation
  
  # Plot 3: Population vs Dependency
  p3 <- data %>%
    filter(!is.na(population)) %>%
    mutate(log_population = log(population)) |> 
    ggplot(aes(x = log_population, y = avg_dependency_pct, color = region)) +
    geom_point(size = 2.5, alpha = 0.8) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed", alpha = 0.3) +
    scale_color_manual(values = region_colors, name = "Region") +
    scale_x_continuous(
      labels = function(x) paste0(comma(10^x, scale = 1e-6), "M"),
      breaks = log10(c(1e6, 10e6, 100e6, 1e9))
    ) +
    scale_y_continuous(labels = function(x) paste0(x, "%")) +
    labs(
      title = "Country Size vs. Trade Dependency",
      subtitle = "Larger countries may have more diversified domestic markets", 
      x = "Population (log scale)",
      y = "Average Dependency Share (%)",
      caption = "Source: World Bank, Authors' calculations"
    ) +
    theme_correlation
  
  # Plot 4: Natural Resource Rents vs Dependency
  p4 <- data %>%
    filter(!is.na(total_natural_resources_rents_pct_gdp)) %>%
    ggplot(aes(x = total_natural_resources_rents_pct_gdp, y = avg_dependency_pct, color = region)) +
    geom_point(size = 2.5, alpha = 0.8) +
    geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed", alpha = 0.3) +
    scale_color_manual(values = region_colors, name = "Region") +
    scale_x_continuous(labels = function(x) paste0(x, "%")) +
    scale_y_continuous(labels = function(x) paste0(x, "%")) +
    labs(
      title = "Resource Dependence vs. Trade Dependency",
      subtitle = "Resource-rich economies show higher dependency patterns",
      x = "Natural Resource Rents (% of GDP)",
      y = "Average Dependency Share (%)", 
      caption = "Source: World Bank, Authors' calculations"
    ) +
    theme_correlation
  cat("...finished!")
  return(list(p1 = p1, p2 = p2, p3 = p3, p4 = p4))
}

plots <- create_correlation_plots(data = final_data)

combined_plot <- (plots$p1 + plots$p2) / (plots$p3 + plots$p4) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")
combined_plot

ggsave(
  filename = here("output/MacroCorrelationPlot.png"), plot = combined_plot,
  width = 12, height = 8, dpi = 300)

# Regressions--------
lm_model_with_eci <- lm(
  avg_dependency_pct ~ eci_hs92 + log(gdp_per_capita_ppp) + 
    log(population) + total_natural_resources_rents_pct_gdp, data = final_data)

lm_model <- lm(
  avg_dependency_pct ~ trade_pct_gdp + log(gdp_per_capita_ppp) + 
    log(population) + total_natural_resources_rents_pct_gdp, data = final_data)

summary(lm_model_with_eci)
summary(lm_model)

## Extract model numbers----

# Extract model results
model_results <- list(
  # Coefficients table
  coefficients = data.frame(
    variable = rownames(summary(lm_model)$coefficients),
    estimate = summary(lm_model)$coefficients[, "Estimate"],
    std_error = summary(lm_model)$coefficients[, "Std. Error"],
    t_value = summary(lm_model)$coefficients[, "t value"],
    p_value = summary(lm_model)$coefficients[, "Pr(>|t|)"],
    stringsAsFactors = FALSE
  ),
  
  # Model statistics
  r_squared = summary(lm_model)$r.squared,
  adj_r_squared = summary(lm_model)$adj.r.squared,
  f_statistic = summary(lm_model)$fstatistic[1],
  f_df1 = summary(lm_model)$fstatistic[2],
  f_df2 = summary(lm_model)$fstatistic[3],
  f_p_value = pf(summary(lm_model)$fstatistic[1], 
                 summary(lm_model)$fstatistic[2], 
                 summary(lm_model)$fstatistic[3], 
                 lower.tail = FALSE),
  residual_se = summary(lm_model)$sigma,
  df_residual = summary(lm_model)$df[2],
  n_obs = length(lm_model$fitted.values)
)

# Add significance stars
add_stars <- function(p_values) {
  stars <- ifelse(p_values < 0.001, "***",
                  ifelse(p_values < 0.01, "**",
                         ifelse(p_values < 0.05, "*",
                                ifelse(p_values < 0.1, ".", ""))))
  return(stars)
}

model_results$coefficients$stars <- add_stars(model_results$coefficients$p_value)

# Save to JSON for easy reading in Python
write_json(model_results, "output/regression_results.json", pretty = TRUE, digits = 6, auto_unbox = TRUE)

# Regression diagnostic plots-----
diagnostic_plots_w_eci <- create_diagnostic_plots(
  lm_model_with_eci, final_data)
diagnostic_plots_no_eci <- create_diagnostic_plots(
  lm_model, final_data)

# Combine diagnostic plots
diagnostic_plots_w_eci <- (diagnostic_plots_w_eci$p1 + diagnostic_plots_w_eci$p2) / 
  (diagnostic_plots_w_eci$p3 + diagnostic_plots_w_eci$p4) +
  plot_layout(guides = "collect") +
  plot_annotation(
    title = 'Regression with ECI',
    subtitle = "DEP = ECI+TradeOpen+log(GDPpc)+log(Population)+RessourceRents"
  )

diagnostic_plots_no_eci <- (diagnostic_plots_no_eci$p1 + diagnostic_plots_no_eci$p2) / 
  (diagnostic_plots_no_eci$p3 + diagnostic_plots_no_eci$p4) +
  plot_layout(guides = "collect") +
  plot_annotation(
    title = 'Diagnostic Plots: Baseline OLS Model',
    subtitle = "DEP = TradeOpen+log(GDPpc)+log(Population)+RessourceRents"
  )

ggsave(
  filename = here("output/diagnostics_with_eci.png"), 
  plot = diagnostic_plots_w_eci, width = 8, height = 6, dpi = 300)
ggsave(
  filename = here("output/diagnostics_no_eci.png"), 
  plot = diagnostic_plots_no_eci, width = 8, height = 6, dpi = 300)

